import pandas as pd
import math
import tabulate as tb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import missingno as msn
import plotly.express as px
import seaborn as sns
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
# Load the water-potability dataset (3276 rows x 10 columns; target = 'Potability').
df=pd.read_csv('water_potability.csv')
######## DATA VISUALIZATION #####
# Bare expression: notebook-style display of the DataFrame (no effect as a plain script).
df
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
| 1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | NaN | 592.885359 | 15.180013 | 56.329076 | 4.500656 | 0 |
| 2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | NaN | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
| 3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 | 0 |
| 4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3271 | 4.668102 | 193.681735 | 47580.991603 | 7.166639 | 359.948574 | 526.424171 | 13.894419 | 66.687695 | 4.435821 | 1 |
| 3272 | 7.808856 | 193.553212 | 17329.802160 | 8.061362 | NaN | 392.449580 | 19.903225 | NaN | 2.798243 | 1 |
| 3273 | 9.419510 | 175.762646 | 33155.578218 | 7.350233 | NaN | 432.044783 | 11.039070 | 69.845400 | 3.298875 | 1 |
| 3274 | 5.126763 | 230.603758 | 11983.869376 | 6.303357 | NaN | 402.883113 | 11.168946 | 77.488213 | 4.708658 | 1 |
| 3275 | 7.874671 | 195.102299 | 17404.177061 | 7.509306 | NaN | 327.459760 | 16.140368 | 78.698446 | 2.309149 | 1 |
3276 rows × 10 columns
# Summary statistics; the reduced counts for ph, Sulfate and Trihalomethanes
# (vs 3276) already expose the columns with missing values.
df.describe()
| ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 2785.000000 | 3276.000000 | 3276.000000 | 3276.000000 | 2495.000000 | 3276.000000 | 3276.000000 | 3114.000000 | 3276.000000 | 3276.000000 |
| mean | 7.080795 | 196.369496 | 22014.092526 | 7.122277 | 333.775777 | 426.205111 | 14.284970 | 66.396293 | 3.966786 | 0.390110 |
| std | 1.594320 | 32.879761 | 8768.570828 | 1.583085 | 41.416840 | 80.824064 | 3.308162 | 16.175008 | 0.780382 | 0.487849 |
| min | 0.000000 | 47.432000 | 320.942611 | 0.352000 | 129.000000 | 181.483754 | 2.200000 | 0.738000 | 1.450000 | 0.000000 |
| 25% | 6.093092 | 176.850538 | 15666.690297 | 6.127421 | 307.699498 | 365.734414 | 12.065801 | 55.844536 | 3.439711 | 0.000000 |
| 50% | 7.036752 | 196.967627 | 20927.833607 | 7.130299 | 333.073546 | 421.884968 | 14.218338 | 66.622485 | 3.955028 | 0.000000 |
| 75% | 8.062066 | 216.667456 | 27332.762127 | 8.114887 | 359.950170 | 481.792304 | 16.557652 | 77.337473 | 4.500320 | 1.000000 |
| max | 14.000000 | 323.124000 | 61227.196008 | 13.127000 | 481.030642 | 753.342620 | 28.300000 | 124.000000 | 6.739000 | 1.000000 |
# Class balance: donut chart of Potability counts (0 = not potable, 1 = potable).
d = pd.DataFrame(df['Potability'].value_counts())
# Derive slice labels from the actual index order; the original hard-coded
# ['Not Potable', 'Potable'] and silently relied on class 0 being the
# majority class (value_counts sorts by frequency).
slice_names = d.index.map({0: 'Not Potable', 1: 'Potable'})
fig = px.pie(d, values='Potability', names=slice_names, hole=0.4, opacity=0.6,
             color_discrete_sequence=['#74C365', '#51C4D3'],
             labels={'label': 'Potability', 'Potability': 'No. Of Samples'})
fig.show()
# Overlay per-feature density estimates for the two target classes to see
# which features separate potable from non-potable samples (none do strongly).
non_potable = df.query("Potability == 0")
potable = df.query("Potability == 1")
plt.figure(figsize = (15, 15))
for idx, feature in enumerate(df.columns[:9], start=1):
    plt.subplot(3, 3, idx)
    plt.title(feature)
    sns.kdeplot(x = non_potable[feature], label = "Non Potable",color='#4F7942')
    sns.kdeplot(x = potable[feature], label = "Potable",color='#51C4D3')
    plt.legend()
plt.tight_layout()
# Pairwise correlation heatmap; mask the upper triangle (incl. diagonal)
# so each correlation is drawn only once.
corr = df.corr()          # hoisted: the original computed df.corr() twice
matrix = np.triu(corr)
plt.figure(figsize=(10,10))
sns.heatmap(corr, annot=True, fmt= '.3f', mask=matrix,cmap='Blues')
<AxesSubplot:>
# Single-column heatmap: every feature's correlation with the target,
# sorted strongest-first.
plt.figure(figsize=(7, 10))
target_corr = df.corr()[['Potability']].sort_values(by='Potability', ascending=False)
heatmap = sns.heatmap(target_corr, annot=True, cmap='GnBu_r')
plt.title('Correlation with Potability',pad=20, fontsize=16)
Text(0.5, 1.0, 'Correlation with Potability')
##### MISSING VALUES #####
# Per-column NaN counts; only ph, Sulfate and Trihalomethanes are affected
# (491 / 781 / 162 per the output below).
df.isnull().sum()
ph 491 Hardness 0 Solids 0 Chloramines 0 Sulfate 781 Conductivity 0 Organic_carbon 0 Trihalomethanes 162 Turbidity 0 Potability 0 dtype: int64
# missingno bar chart of non-null counts; bars for columns that contain
# NaNs are highlighted in blue, complete columns stay gray.
mis_colors = ['#51C4D3' if df[col].isna().sum() != 0 else 'gray'
              for col in df.columns]
msn.bar(df, color=mis_colors)
plt.title('Missing values (before)', size=40, y=1.15)
Text(0.5, 1.15, 'Missing values (before)')
# Fill the missing values with a 10-nearest-neighbour average (uniform weights).
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=10, weights="uniform")
# fit_transform returns a bare ndarray, so rebuild the DataFrame with the
# original column names. (Dropped the ambiguous single-letter temp 'l'.)
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
# Re-check after imputation: every column should now be complete, so all
# bars take the green "no missing values" colour.
mis_colors_after = ['#51C4D3' if df[col].isna().sum() != 0 else '#74C365'
                    for col in df.columns]
msn.bar(df, color=mis_colors_after)
plt.title('Missing values (after)', size=45, y=1.15)
Text(0.5, 1.15, 'Missing values (after)')
# One box plot per feature (target excluded) to eyeball outliers.
plt.figure(figsize=(17,25))
for i, feature in enumerate(df.columns.drop('Potability'), start=1):
    plt.subplot(6,3,i)
    sns.boxplot(y=df[feature])
# Feature matrix and target vector.
X = df.drop('Potability',axis=1)
y = df['Potability'].values
# Tabulate each feature's integer value range to motivate scaling.
# (Replaces the two parallel append-loops and the E741-ambiguous name 'l'.)
ranges = [f'{math.floor(X[col].min())} to {math.ceil(X[col].max())}'
          for col in X.columns]
tab = pd.DataFrame({'Name': X.columns, 'Range': ranges})
print(tb.tabulate(tab, headers='keys', tablefmt='pretty'))
# Hold out 30% of the samples for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
+---+-----------------+--------------+ | | Name | Range | +---+-----------------+--------------+ | 0 | ph | 0 to 14 | | 1 | Hardness | 47 to 324 | | 2 | Solids | 320 to 61228 | | 3 | Chloramines | 0 to 14 | | 4 | Sulfate | 129 to 482 | | 5 | Conductivity | 181 to 754 | | 6 | Organic_carbon | 2 to 29 | | 7 | Trihalomethanes | 0 to 124 | | 8 | Turbidity | 1 to 7 | +---+-----------------+--------------+
#As there are variations between the ranges of the column values, scaling is necessary.
# Fit on the training split only so no test-set statistics leak into the scaler.
scaler=StandardScaler()
scaler.fit(X_train)
# transform() returns a bare ndarray; rebuild the DataFrames with the original
# feature names and row indices instead of the default 0..8 integer columns
# the original code silently produced.
X_train=pd.DataFrame(scaler.transform(X_train), columns=X.columns, index=X_train.index)
X_test=pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)
from sklearn.metrics import classification_report
# Hyper-parameter search for an SVM: 6 C values x 2 kernels, 10-fold CV.
param_grid_svc = {'C': [0.01, 0.05, 0.1, 0.5, 1.0, 10.0],'kernel': ['linear','rbf']}
# Create an instance of GridSearch Cross-validation estimator.
# n_jobs=-1 parallelises the 120 fits (consistent with the MLP search in this file).
model=SVC()
gsSVC = GridSearchCV(estimator= model,
                     param_grid = param_grid_svc,
                     scoring='accuracy',
                     cv=10,
                     n_jobs=-1,
                     refit=True)
# Train the SVM classifier on the standardised training split
gsSVC.fit(X_train, y_train)
# Print the model parameters of the best model
print(gsSVC.best_params_)
# Print the model score on the test data using GridSearchCV score method
print('Test accuracy: %.3f' % gsSVC.score(X_test, y_test))
print(classification_report(y_test,gsSVC.predict(X_test)))
{'C': 1.0, 'kernel': 'rbf'}
Test accuracy: 0.674
precision recall f1-score support
0.0 0.67 0.93 0.78 603
1.0 0.71 0.27 0.39 380
accuracy 0.67 983
macro avg 0.69 0.60 0.58 983
weighted avg 0.68 0.67 0.63 983
#then calculate the mean and standard deviation of the scores
# FIX: the grid search above was run on standardised features, but this CV
# previously scored SVC on the raw X. SVC is scale-sensitive, which made the
# CV accuracy look artificially low (0.61 vs 0.674 on the test split).
# Standardise inside each fold with a pipeline so no fold leaks statistics.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
scores = cross_val_score(make_pipeline(StandardScaler(), SVC(C=1.0, kernel='rbf')),
                         X, y, cv=5, scoring='accuracy')
print(scores)
print("%0.2f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))
[0.6097561 0.61068702 0.61068702 0.60916031 0.60916031] 0.61 accuracy with a standard deviation of 0.001
from sklearn.neural_network import MLPClassifier
# Hyper-parameter search for a multi-layer perceptron (5-fold CV, parallel).
mlp_gs = MLPClassifier(max_iter=1000)
parameter_space = {
    'hidden_layer_sizes': [(10,30,10),(20,)],
    'activation': ['tanh', 'relu'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
mlp = GridSearchCV(estimator=mlp_gs,
                   param_grid= parameter_space,
                   n_jobs=-1,
                   cv=5,
                   refit=True)
# Train the MLP classifier (original comment wrongly said "SVM")
mlp.fit(X_train, y_train)
print(mlp.best_params_)
# Print the model score on the test data using GridSearchCV score method
print('Test accuracy: %.3f' % mlp.score(X_test, y_test))
# Predict the test split once and reuse it (the original predicted X_test
# twice and computed an unused predict_train).
predict_test = mlp.predict(X_test)
print(classification_report(y_test, predict_test))
{'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (20,), 'learning_rate': 'constant'}
Test accuracy: 0.667
precision recall f1-score support
0.0 0.69 0.83 0.75 603
1.0 0.60 0.42 0.49 380
accuracy 0.67 983
macro avg 0.65 0.62 0.62 983
weighted avg 0.66 0.67 0.65 983
# FIX: same inconsistency as the SVC cross-validation — the MLP was tuned on
# standardised features but was previously cross-validated on raw X. MLPs are
# scale-sensitive; standardise inside each fold with a pipeline.
from sklearn.pipeline import make_pipeline
scores_mlp = cross_val_score(
    make_pipeline(StandardScaler(),
                  MLPClassifier(max_iter=1000, activation='relu', alpha=0.05,
                                hidden_layer_sizes=(20,), learning_rate='constant')),
    X, y, cv=5, scoring='accuracy')
print(scores_mlp)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_mlp.mean(), scores_mlp.std()))
[0.60365854 0.61068702 0.55877863 0.55877863 0.44427481] 0.56 accuracy with a standard deviation of 0.06
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
# Hyper-parameter search for a random forest over tree depth and the number
# of features considered per split (10-fold CV, parallelised).
param_grid_forest = {'max_features':[2, 3, 4, 6,8],'max_depth':[2, 3, 4,10]}
# Create an instance of GridSearch Cross-validation estimator
forest=RandomForestClassifier(n_estimators= 100,criterion='gini', random_state=0)
gsForest= GridSearchCV(estimator= forest,
                       param_grid = param_grid_forest,
                       scoring='accuracy',
                       cv=10,
                       n_jobs=-1,
                       refit=True)
# Train the random forest (original comment wrongly said "SVM";
# dead commented-out cross_val_score lines removed)
gsForest.fit(X_train, y_train)
# Print the model parameters of the best model
print(gsForest.best_params_)
# Print the model score on the test data using GridSearchCV score method
print('Test accuracy: %.3f' % gsForest.score(X_test, y_test))
print(classification_report(y_test,gsForest.predict(X_test)))
{'max_depth': 10, 'max_features': 6}
Test accuracy: 0.678
precision recall f1-score support
0.0 0.67 0.92 0.78 603
1.0 0.70 0.29 0.41 380
accuracy 0.68 983
macro avg 0.69 0.61 0.59 983
weighted avg 0.68 0.68 0.64 983
# 5-fold cross-validation of the tuned forest on the raw features
# (tree models are insensitive to feature scaling, so no pipeline is needed).
best_forest = RandomForestClassifier(n_estimators= 100,criterion='gini', random_state=0,max_depth= 10, max_features= 6)
scores_forest = cross_val_score(best_forest, X, y, cv=5,scoring='accuracy')
print(scores_forest)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_forest.mean(), scores_forest.std()))
[0.6097561 0.64885496 0.65648855 0.61526718 0.66564885] 0.64 accuracy with a standard deviation of 0.02